{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def reduce_mem_usage(df):\n",
    "    \"\"\" iterate through all the columns of a dataframe and modify the data type\n",
    "        to reduce memory usage.        \n",
    "    \"\"\"\n",
    "    start_mem = df.memory_usage().sum() \n",
    "    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))\n",
    "    \n",
    "    for col in df.columns:\n",
    "        col_type = df[col].dtype\n",
    "        \n",
    "        if col_type != object:\n",
    "            c_min = df[col].min()\n",
    "            c_max = df[col].max()\n",
    "            if str(col_type)[:3] == 'int':\n",
    "                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:\n",
    "                    df[col] = df[col].astype(np.int8)\n",
    "                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:\n",
    "                    df[col] = df[col].astype(np.int16)\n",
    "                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:\n",
    "                    df[col] = df[col].astype(np.int32)\n",
    "                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:\n",
    "                    df[col] = df[col].astype(np.int64)  \n",
    "            else:\n",
    "                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:\n",
    "                    df[col] = df[col].astype(np.float16)\n",
    "                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:\n",
    "                    df[col] = df[col].astype(np.float32)\n",
    "                else:\n",
    "                    df[col] = df[col].astype(np.float64)\n",
    "        else:\n",
    "            df[col] = df[col].astype('category')\n",
    "\n",
    "    end_mem = df.memory_usage().sum() \n",
    "    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))\n",
    "    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))\n",
    "    \n",
    "    return df\n",
    "\n",
    "def load_data(path):\n",
    "    user = reduce_mem_usage(pd.read_csv(path + 'user.csv',header=None))\n",
    "    item = reduce_mem_usage(pd.read_csv(path + 'item.csv',header=None))\n",
    "    data = pd.read_csv(path + 'user_behavior.csv',header=None)\n",
    "\n",
    "    data.columns = ['userID','itemID','behavior','timestamp']\n",
    "    data['day'] = data['timestamp'] // 86400\n",
    "    data['hour'] = data['timestamp'] // 3600 % 24\n",
    "    \n",
    "    ## 生成behavior的onehot\n",
    "    for i in ['pv','fav','cart','buy']:\n",
    "        data[i] = 0\n",
    "        data.loc[data['behavior'] == i, i] = 1\n",
    "\n",
    "    ## 生成behavior的加权\n",
    "    \n",
    "    data['day_hour'] = data['day'] + data['hour'] / float(24)\n",
    "    data.loc[data['behavior']=='pv','behavior'] = 1\n",
    "    data.loc[data['behavior']=='fav','behavior'] = 2\n",
    "    data.loc[data['behavior']=='cart','behavior'] = 3\n",
    "    data.loc[data['behavior']=='buy','behavior'] = 1\n",
    "    max_day = max(data['day'])\n",
    "    min_day = min(data['day'])\n",
    "    data['behavior'] = (1 - (max_day-data['day_hour']+2)/(max_day-min_day+2)) * data['behavior'] \n",
    "\n",
    "    item.columns = ['itemID','category','shop','brand']\n",
    "    user.columns = ['userID','sex','age','ability']\n",
    "    \n",
    "    data = reduce_mem_usage(data)\n",
    "\n",
    "    data = pd.merge(left=data, right=item, on='itemID',how='left')\n",
    "    data = pd.merge(left=data, right=user, on='userID',how='left')\n",
    "\n",
    "    return user, item, data\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Memory usage of dataframe is 44702560.00 MB\n",
      "Memory usage after optimization is: 9778785.00 MB\n",
      "Decreased by 78.1%\n",
      "Memory usage of dataframe is 138182592.00 MB\n",
      "Memory usage after optimization is: 60454956.00 MB\n",
      "Decreased by 56.2%\n",
      "Memory usage of dataframe is 7081839904.00 MB\n",
      "Memory usage after optimization is: 1770460072.00 MB\n",
      "Decreased by 75.0%\n"
     ]
    }
   ],
   "source": [
    "path = '../ECommAI_EUIR_round2_train_20190816/'\n",
    "\n",
    "user, item, data = load_data(path = path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "for count_feature in ['itemID', 'shop', 'category','brand']:\n",
    "    data[['behavior', count_feature]].groupby(count_feature, as_index=False).agg(\n",
    "        {'behavior':'count'}).rename(columns={'behavior':count_feature + '_count'}).to_csv(str(count_feature)+'_count.csv', index=False)\n",
    "\n",
    "for count_feature in ['itemID', 'shop', 'category','brand']:\n",
    "    data[['behavior', count_feature]].groupby(count_feature, as_index=False).agg(\n",
    "        {'behavior':'sum'}).rename(columns={'behavior':count_feature + '_sum'}).to_csv(str(count_feature)+'_sum.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "temp = data[['behavior','category']].groupby('category', as_index=False).agg({'behavior': ['median','std','skew']})\n",
    "temp.columns = ['category','category_median','category_std','category_skew']\n",
    "\n",
    "temp.to_csv('category_higher.csv',index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "temp = data[['behavior','itemID']].groupby('itemID', as_index=False).agg({'behavior': ['median','std','skew']})\n",
    "temp.columns = ['itemID','itemID_median','itemID_std','itemID_skew']\n",
    "\n",
    "temp.to_csv('itemID_higher.csv',index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "data['age'] = data['age'] // 10\n",
    "train = data[data['day'] < 15]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "## 注意 这个要生成一个underline版本和一个online版本\n",
    "for count_feature in ['sex','ability','age']:\n",
    "    data[['behavior','itemID',count_feature]].groupby(['itemID', count_feature], as_index=False).agg(\n",
    "        {'behavior': 'count'}).rename(columns={'behavior':'user_to_'\n",
    "                                               + count_feature + '_count'}).to_csv('item_to_' + str(count_feature)+'_count_online.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "itemcount = pd.read_csv('itemID_count.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "temp = pd.merge(left=item, right=itemcount, how='left', on='itemID')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "item_rank = []\n",
    "for eachcat in temp.groupby('category'):\n",
    "    each_df = eachcat[1].sort_values('itemID_count', ascending=False).reset_index(drop=True)\n",
    "    each_df['rank'] = each_df.index + 1\n",
    "    lenth = each_df.shape[0]\n",
    "    each_df['rank_percent'] = (each_df.index + 1) / lenth\n",
    "    item_rank.append(each_df[['itemID','rank','rank_percent']])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "item_rank = pd.concat(item_rank, sort=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "item_rank.to_csv('item_rank.csv',index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def unique_count(x):\n",
    "    return len(set(x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cat1 = item.groupby('category',as_index=False).agg({'itemID': unique_count}).rename(columns={'itemID':'itemnum_undercat'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cat2 = item.groupby('category',as_index=False).agg({'brand': unique_count}).rename(columns={'brand':'brandnum_undercat'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cat3 = item.groupby('category',as_index=False).agg({'shop': unique_count}).rename(columns={'shop':'shopnum_undercat'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.concat([cat1, cat2[['brandnum_undercat']], cat3[['shopnum_undercat']]], axis=1).to_csv('category_lower.csv',index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
